The first thing to do is to read in the data from the csv file.
library(readr)
library(prophet)
## Loading required package: Rcpp
## Loading required package: rlang
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(DT)
library(ggrepel)
library(leaflet)
library(leaflet.extras)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v tibble 2.1.3 v purrr 0.3.2
## v tidyr 1.0.0 v stringr 1.4.0
## v tibble 2.1.3 v forcats 0.4.0
## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x purrr::%@%() masks rlang::%@%()
## x purrr::as_function() masks rlang::as_function()
## x scales::col_factor() masks readr::col_factor()
## x purrr::discard() masks scales::discard()
## x dplyr::filter() masks stats::filter()
## x purrr::flatten() masks rlang::flatten()
## x purrr::flatten_chr() masks rlang::flatten_chr()
## x purrr::flatten_dbl() masks rlang::flatten_dbl()
## x purrr::flatten_int() masks rlang::flatten_int()
## x purrr::flatten_lgl() masks rlang::flatten_lgl()
## x purrr::flatten_raw() masks rlang::flatten_raw()
## x purrr::invoke() masks rlang::invoke()
## x dplyr::lag() masks stats::lag()
## x purrr::list_along() masks rlang::list_along()
## x purrr::modify() masks rlang::modify()
## x purrr::prepend() masks rlang::prepend()
## x MASS::select() masks dplyr::select()
## x purrr::splice() masks rlang::splice()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
# Load the raw DC crime extract and normalize its column names.
# NOTE(review): setwd() in a script is fragile -- it only works on this one
# machine. Prefer a relative path or here::here() when this is reused.
setwd("C:/Users/Jeffo/Downloads")
crimedc <- read.csv("crimedc.csv")   # read in from csv file
crimedc <- clean_names(crimedc)      # janitor: lower snake_case column names
crimedc$ccn <- NULL                  # drop the ccn identifier column
# Convert runs of uppercase letters to Title Case, e.g. "MAIN STREET NW" ->
# "Main Street Nw". Tokens that start with a digit (e.g. "7TH") are left
# untouched because \b does not match between two word characters.
proper_case <- function(x) {
  gsub("\\b([A-Z])([A-Z]+)", "\\U\\1\\L\\2", x, perl = TRUE)
}
# Title-case the block descriptions (e.g. "UPSHUR STREET NW" -> "Upshur Street Nw").
crimedc <- crimedc %>% mutate(block = proper_case(block))

# Using mutate(), derive hour/date/month/year from the report timestamp,
# plus two convenience flags used throughout the rest of the analysis.
crimedc <- crimedc %>%
  mutate(
    incident_hour = hour(ymd_hms(report_dat)),
    incidentdate  = as.Date(report_dat),
    incident_month = month(ymd_hms(report_dat)),
    incident_year  = year(ymd_hms(report_dat)),
    # BUG FIX: the original flag (incident_hour > 6 & incident_hour < 18) was
    # TRUE for *daytime* hours (07:00-17:59) despite being named `atnight`,
    # which inverted the "Daytime"/"Nighttime" labels in every later plot.
    # TRUE now means the incident was reported at night (18:00-06:59).
    atnight = incident_hour <= 6 | incident_hour >= 18,
    burglary = offense == "BURGLARY"
  )
dcburglary    <- filter(crimedc, burglary == TRUE)  # burglary incidents only
dcatnight     <- filter(crimedc, atnight == TRUE)   # nighttime incidents only
nightburglary <- filter(dcatnight, burglary == TRUE) # burglaries reported at night
There are 30 different variables in this dataset (after the derived columns are added), as well as 26,264 different observations. Most of the variables are factors, and some are numerical.
str(crimedc)
## 'data.frame': 26264 obs. of 30 variables:
## $ i_x : num -77 -77 -77 -77 -77.1 ...
## $ y : num 38.9 38.9 38.9 38.9 38.9 ...
## $ report_dat : Factor w/ 26222 levels "2019-01-01T00:00:00.000Z",..: 24795 25139 24393 24955 24993 25193 25460 25462 25513 25337 ...
## $ shift : Factor w/ 3 levels "DAY","EVENING",..: 1 2 1 3 1 1 1 1 2 2 ...
## $ method : Factor w/ 3 levels "GUN","KNIFE",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ offense : Factor w/ 9 levels "ARSON","ASSAULT W/DANGEROUS WEAPON",..: 9 9 9 8 8 9 9 9 9 8 ...
## $ block : chr "700 - 799 Block Of Upshur Street Nw" "1650 - 1691 Block Of Lanier Place Nw" "700 - 799 Block Of 7TH Street Nw" "1100 - 1199 Block Of Park Street Ne" ...
## $ xblock : num 398050 396582 398098 400790 392391 ...
## $ yblock : num 141562 139810 136808 136106 141648 ...
## $ ward : int 4 1 2 6 3 1 1 4 6 3 ...
## $ anc : Factor w/ 40 levels "1A","1B","1C",..: 19 3 7 26 14 3 3 20 28 14 ...
## $ district : int NA NA NA NA NA NA NA NA NA NA ...
## $ psa : int NA NA NA NA NA NA NA NA NA NA ...
## $ neighborhood_cluster: Factor w/ 40 levels "","Cluster 1",..: 11 2 39 19 4 2 2 11 19 4 ...
## $ block_group : Factor w/ 451 levels "","000100 1",..: 120 168 224 324 46 176 173 109 329 46 ...
## $ census_tract : int 2400 3900 5800 8100 1001 4002 4001 2201 8301 1001 ...
## $ voting_precinct : Factor w/ 143 levels "Precinct 1","Precinct 10",..: 84 73 34 128 68 62 62 95 127 68 ...
## $ latitude : num 38.9 38.9 38.9 38.9 38.9 ...
## $ longitude : num -77 -77 -77 -77 -77.1 ...
## $ bid : Factor w/ 12 levels "","ADAMS MORGAN",..: 1 1 6 1 1 1 1 1 1 1 ...
## $ start_date : Factor w/ 26190 levels "2005-04-17T15:00:34.000Z",..: 24812 25071 24459 24923 25031 25059 25473 23790 25496 24916 ...
## $ end_date : Factor w/ 22330 levels "","2005-04-17T15:30:31.000Z",..: 21071 21350 20755 21171 21267 21423 21678 20379 21731 21176 ...
## $ objectid : int 352778328 352778329 352778330 352778331 352778332 352778333 352778334 352778335 352778336 352778337 ...
## $ octo_record_id : Factor w/ 26264 levels "17084415-01",..: 26250 26251 26252 26253 26254 26255 26256 26257 26258 26259 ...
## $ incident_hour : int 12 21 13 0 12 14 10 10 19 22 ...
## $ incidentdate : Date, format: "2019-10-02" "2019-10-05" ...
## $ incident_month : num 10 10 9 10 10 10 10 10 10 10 ...
## $ incident_year : num 2019 2019 2019 2019 2019 ...
## $ atnight : logi TRUE FALSE TRUE FALSE TRUE TRUE ...
## $ burglary : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
In this dataset, there are 9 unique offenses represented. Unfortunately, drug offenses are not represented in this dataset, so we will focus on burglary.
unique(crimedc$offense)
## [1] THEFT/OTHER THEFT F/AUTO
## [3] SEX ABUSE HOMICIDE
## [5] ASSAULT W/DANGEROUS WEAPON ROBBERY
## [7] BURGLARY MOTOR VEHICLE THEFT
## [9] ARSON
## 9 Levels: ARSON ASSAULT W/DANGEROUS WEAPON BURGLARY ... THEFT/OTHER
As you can see, there are no missing offense observations, which makes this dataset as accurate as possible.
sum(is.na(crimedc$offense))
## [1] 0
The crime I will mainly be focusing on in this dataset is burglary. As we can see, there are 1015 different observations for Burglary arrests.
table(crimedc$offense)[order(table(crimedc$offense), decreasing = T)] # Order decreasing
##
## THEFT/OTHER THEFT F/AUTO
## 11934 8208
## MOTOR VEHICLE THEFT ROBBERY
## 1783 1738
## ASSAULT W/DANGEROUS WEAPON BURGLARY
## 1277 1015
## SEX ABUSE HOMICIDE
## 166 136
## ARSON
## 7
Next I have a look at the frequency of crimes by month. Looking at the results, you can see that the most common months for DC crimes are August and September.
# Arrest counts per month, most frequent first.
crimedc %>%
  filter(!is.na(incident_month)) %>%
  group_by(incident_month) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
## # A tibble: 10 x 2
## incident_month count
## <dbl> <int>
## 1 8 3060
## 2 9 3033
## 3 7 3018
## 4 6 2863
## 5 1 2720
## 6 5 2717
## 7 4 2558
## 8 3 2438
## 9 2 2303
## 10 10 1554
Next I have a look at the frequency of burglary crimes by month. Looking at the results, you can see that the most common months for DC burglary crimes are May and January. Surprisingly, August is still in the top 3. Something interesting to note is that there is no data for November and December.
# Burglary arrest counts per month, most frequent first.
dcburglary %>%
  filter(!is.na(incident_month)) %>%
  group_by(incident_month) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
## # A tibble: 10 x 2
## incident_month count
## <dbl> <int>
## 1 5 126
## 2 1 125
## 3 8 110
## 4 2 107
## 5 6 107
## 6 9 106
## 7 3 101
## 8 7 97
## 9 4 87
## 10 10 49
ggplot(data=crimedc,aes(x=reorder(ward,ward,function(x)-length(x))))+geom_bar()+labs(title="DC arrests by Ward", x="Ward #")+ theme(plot.title = element_text(hjust = 0.5))
For DC, there is a steady increase in arrests from February until September, with a peak in August.
# Monthly arrest counts, bars filled by the atnight flag.
ggplot(data = crimedc, aes(x = incident_month)) +
  geom_bar(aes(fill = atnight)) +
  labs(title = "Arrests in DC by Month") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_discrete(
    name = "month",
    limits = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12")
  ) +
  scale_fill_discrete(name = "Time of Day", labels = c("Daytime", "Nightime"))
For DC burglaries, the monthly pattern is flatter: May and January have the most arrests, and August is still near the top.
# Monthly burglary arrest counts, bars filled by the atnight flag.
ggplot(data = dcburglary, aes(x = incident_month)) +
  geom_bar(aes(fill = atnight)) +
  labs(title = " Burglary Arrests in DC by Month") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_discrete(
    name = "month",
    limits = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12")
  ) +
  scale_fill_discrete(name = "Time of Day", labels = c("Daytime", "Nightime"))
DC has more arrests in the daytime than at nighttime.
# Day vs night arrest totals, dropping rows with a missing atnight flag.
P <- ggplot(data = subset(crimedc, !is.na(atnight)), aes(x = atnight)) +
  geom_bar(aes(fill = atnight))
P +
  scale_x_discrete(labels = c("FALSE" = "Daytime", "TRUE" = "Nighttime")) +
  labs(title = "DC Arrests in Daytime vs Nightime") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_discrete(name = "Time of Day", labels = c("Daytime", "Nightime")) +
  theme(legend.position = "none")
An interesting observation is that DC burglaries happen more at night, compared to MD burglaries, which happen mostly in the daytime.
# Day vs night burglary totals, dropping rows with a missing atnight flag.
P <- ggplot(data = subset(dcburglary, !is.na(atnight)), aes(x = atnight)) +
  geom_bar(aes(fill = atnight))
P +
  scale_x_discrete(labels = c("FALSE" = "Daytime", "TRUE" = "Nighttime")) +
  labs(title = "DC Burglary Arrests in Daytime vs Nightime", x = "Time of the day") +
  theme(plot.title = element_text(hjust = 0.5))
# Using the Stamen tiles, define the bounding box for the map of DC.
dc_bb <- c(
  left = -77.2413345,
  bottom = 38.8171923,
  right = -76.868707,
  top = 39.1
)
# Fetch the base-map tiles for that bounding box.
dc_stamen <- get_stamenmap(bbox = dc_bb, zoom = 10)
## Source : http://tile.stamen.com/terrain/10/292/390.png
## Source : http://tile.stamen.com/terrain/10/293/390.png
## Source : http://tile.stamen.com/terrain/10/292/391.png
## Source : http://tile.stamen.com/terrain/10/293/391.png
## Source : http://tile.stamen.com/terrain/10/292/392.png
## Source : http://tile.stamen.com/terrain/10/293/392.png
# Using ggmap, plot every incident on the base map, colored by offense type.
ggmap(dc_stamen) +
  geom_point(data = crimedc,
             mapping = aes(x = longitude, y = latitude, color = offense))
# Same map, restricted to nighttime burglaries.
ggmap(dc_stamen) +
  geom_point(data = nightburglary,
             mapping = aes(x = longitude, y = latitude, color = offense))
# Interactive leaflet heatmap of incident density.
leaflet(crimedc) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addWebGLHeatmap(lng = ~longitude, lat = ~latitude, size = 200)
# Statistical Data Analysis
Using the chi-squared test, I look to see if there is any association between offense and shift. I set the significance level at 0.05. Null hypothesis (H0): the row and column variables of the contingency table are independent. Alternative hypothesis (H1): the row and column variables are dependent.
# Contingency table of offense by patrol shift.
r <- table(crimedc$offense, crimedc$shift)
# Pearson chi-squared test of independence (no continuity correction).
chisq <- chisq.test(crimedc$offense, crimedc$shift, correct = FALSE)
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
chisq
##
## Pearson's Chi-squared test
##
## data: crimedc$offense and crimedc$shift
## X-squared = 1908.3, df = 16, p-value < 2.2e-16
q<-qchisq(c(.025,.975),df=16, lower.tail=FALSE) #95 confidence interval
Based on the fact that the p-value is much smaller than the 0.05 significance level I chose, we can reject the null hypothesis and conclude that the variables offense and shift are statistically significantly associated.
chisq$observed
## crimedc$shift
## crimedc$offense DAY EVENING MIDNIGHT
## ARSON 2 4 1
## ASSAULT W/DANGEROUS WEAPON 226 498 553
## BURGLARY 446 299 270
## HOMICIDE 0 0 136
## MOTOR VEHICLE THEFT 763 630 390
## ROBBERY 316 641 781
## SEX ABUSE 48 84 34
## THEFT F/AUTO 3241 3175 1792
## THEFT/OTHER 3783 6070 2081
round(chisq$expected,2)
## crimedc$shift
## crimedc$offense DAY EVENING MIDNIGHT
## ARSON 2.35 3.04 1.61
## ASSAULT W/DANGEROUS WEAPON 429.09 554.34 293.58
## BURGLARY 341.05 440.60 233.34
## HOMICIDE 45.70 59.04 31.27
## MOTOR VEHICLE THEFT 599.11 773.99 409.91
## ROBBERY 583.99 754.45 399.56
## SEX ABUSE 55.78 72.06 38.16
## THEFT F/AUTO 2757.98 3563.03 1886.99
## THEFT/OTHER 4009.96 5180.46 2743.58
round(chisq$residuals, 3)
## crimedc$shift
## crimedc$offense DAY EVENING MIDNIGHT
## ARSON -0.230 0.551 -0.480
## ASSAULT W/DANGEROUS WEAPON -9.804 -2.393 15.141
## BURGLARY 5.683 -6.746 2.400
## HOMICIDE -6.760 -7.684 18.731
## MOTOR VEHICLE THEFT 6.696 -5.176 -0.983
## ROBBERY -11.090 -4.130 19.082
## SEX ABUSE -1.041 1.407 -0.674
## THEFT F/AUTO 9.197 -6.501 -2.187
## THEFT/OTHER -3.584 12.359 -12.650
library(corrplot)
## corrplot 0.84 loaded
corrplot(chisq$residuals, is.cor = FALSE)
# Repeat the analysis for ward vs shift.
r <- table(crimedc$ward, crimedc$shift)
chisq <- chisq.test(crimedc$ward, crimedc$shift, correct = FALSE)
# Print the test result.
chisq
##
## Pearson's Chi-squared test
##
## data: crimedc$ward and crimedc$shift
## X-squared = 530.4, df = 14, p-value < 2.2e-16
q<-qchisq(c(.025,.975),df=16, lower.tail=FALSE) #95 confidence interval
# Balloon plot of the ward-by-shift contingency table.
# 1. convert the contingency table to a plain table object
dt <- as.table(as.matrix(r))
# 2. draw it; ball size reflects the cell count
balloonplot(t(dt), main = "r", xlab = "", ylab = "",
            label = FALSE, show.margins = FALSE, colsrt = 60, colmar = 5)
chisq$observed
## crimedc$shift
## crimedc$ward DAY EVENING MIDNIGHT
## 1 1135 1544 1043
## 2 1523 2746 1270
## 3 531 859 158
## 4 786 886 309
## 5 1381 1516 991
## 6 1627 1926 910
## 7 1031 1110 766
## 8 811 814 591
round(chisq$expected,2)
## crimedc$shift
## crimedc$ward DAY EVENING MIDNIGHT
## 1 1250.63 1615.69 855.67
## 2 1861.17 2404.44 1273.40
## 3 520.15 671.97 355.88
## 4 665.64 859.94 455.42
## 5 1306.41 1687.75 893.84
## 6 1499.62 1937.35 1026.03
## 7 976.78 1261.91 668.31
## 8 744.60 961.95 509.45
round(chisq$residuals, 3)
## crimedc$shift
## crimedc$ward DAY EVENING MIDNIGHT
## 1 -3.270 -1.784 6.404
## 2 -7.839 6.966 -0.095
## 3 0.476 7.215 -10.489
## 4 4.665 0.889 -6.861
## 5 2.064 -4.181 3.250
## 6 3.289 -0.258 -3.622
## 7 1.735 -4.276 3.779
## 8 2.433 -4.770 3.613
library(corrplot)
corrplot(chisq$residuals, is.cor = FALSE)
# Conclusion
Looking at the correlation plot for the residuals, we can see that there are strong positive associations between Evening and Wards 2 and 3. There is also a strong positive association between Ward 1 and Midnight. There is a strong negative association between Ward 3 and Midnight, as well as between Ward 2 and Day.
# Percentage contribution of each cell to the chi-squared statistic.
contrib <- 100 * chisq$residuals^2 / chisq$statistic
round(contrib, 3)
## crimedc$shift
## crimedc$ward DAY EVENING MIDNIGHT
## 1 2.016 0.600 7.732
## 2 11.584 9.148 0.002
## 3 0.043 9.814 20.744
## 4 4.103 0.149 8.876
## 5 0.803 3.295 1.991
## 6 2.040 0.013 2.474
## 7 0.567 3.448 2.692
## 8 1.116 4.290 2.461
corrplot(contrib, is.cor = FALSE)
# Build the daily incident-count series prophet expects (columns ds and y),
# keeping only dates after 2019-07-01.
df <- crimedc %>%
  count(incidentdate) %>%
  rename(ds = incidentdate, y = n) %>%
  filter(!is.na(ds), ds > as.Date("2019-07-01"))
# Fit the prophet forecasting model to the daily counts.
m <- prophet(df)
## Disabling yearly seasonality. Run prophet with yearly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
# Extend the date range one year past the observed data and forecast over it.
future <- make_future_dataframe(m, periods = 365)
forecast <- predict(m, future)
# Show the last few forecast rows with their uncertainty bounds.
tail(forecast[c("ds", "yhat", "yhat_lower", "yhat_upper")])
## ds yhat yhat_lower yhat_upper
## 467 2020-10-10 110.1187 90.20261 128.3634
## 468 2020-10-11 102.2543 82.70361 121.3101
## 469 2020-10-12 114.4556 94.05422 133.8316
## 470 2020-10-13 103.5294 83.94160 122.8648
## 471 2020-10-14 98.7148 80.60278 118.1941
## 472 2020-10-15 102.3473 82.62391 120.1891
# Static plot of the fitted model and forecast.
plot(m, forecast)
# Decomposition into trend and seasonal components.
prophet_plot_components(m, forecast)
# Interactive dygraph version of the forecast.
dyplot.prophet(m, forecast)
## Registered S3 method overwritten by 'xts':
## method from
## as.zoo.xts zoo